What does one of the world’s most famous bikeshares look like, on one of its city’s wildest nights? In this project, I track usage of the Paris Vélib system from 6 PM on Bastille day through to 6 AM the next morning. To do so, I leverage the excellent open data system that the Paris Mayor’s Office has put in place.
#Getting data on where stations are
station_data2<- fromJSON("https://velib-metropole-opendata.smoove.pro/opendata/Velib_Metropole/station_information.json")
station_data <- as_tibble(station_data2$data$stations)
station_data <- station_data %>%
select(-stationCode, -rental_methods)
write.csv(station_data, "nyt_climate_data/station_data.csv")
As mentioned, I ran this code over a specific timeframe. I have therefore opted to not have this code evaluated.
#Creating a function that takes a number of times for the machine to collect data, as well as the rest time in between in seconds
velib_night_function <- function(i, s){
message(Sys.time())
json <- fromJSON(content(GET("https://velib-metropole-opendata.smoove.pro/opendata/Velib_Metropole/station_status.json"), "text"))
velib_availab1 <- (json)
velib_prev <- velib_availab1$data$stations
velib_prev <- velib_prev %>%
select(station_id, num_bikes_available, num_docks_available, is_installed, is_renting, is_returning)
velib_prev <- velib_prev %>%
filter(is_installed==1 & is_returning==1 & is_renting==1)
velib_prev <- velib_prev %>%
select(-is_installed, -is_renting, -is_returning)
velib_prev <- rename(velib_prev, "num_bikes_available_prev"=num_bikes_available, "num_docks_available_prev"=num_docks_available)
Sys.sleep(s)
a=1
while (a<=i){
json <- fromJSON(content(GET("https://velib-metropole-opendata.smoove.pro/opendata/Velib_Metropole/station_status.json"), "text"))
velib_availab1 <- (json)
velib_now <- velib_availab1$data$stations
velib_now <- velib_now %>%
select(station_id, num_bikes_available, num_docks_available, is_installed, is_renting, is_returning)
velib_now <- velib_now %>%
filter(is_installed==1 & is_returning==1 & is_renting==1)
velib_now <- velib_now %>%
select(-is_installed, -is_renting, -is_returning)
#Generate comparison
velib_comp <- left_join(velib_prev, velib_now, by="station_id")
velib_comp$docks_avail_change <- abs(velib_comp$num_docks_available_prev-velib_comp$num_docks_available)
velib_comp$bikes_avail_change <- abs(velib_comp$num_bikes_available_prev-velib_comp$num_bikes_available)
velib_comp <- velib_comp %>%
select(station_id, docks_avail_change, bikes_avail_change)
#Adding a time tracker for the time, in Paris (one hour ahead of my system time)
velib_comp$time_track<- (Sys.time()+hours(1))
#Put just the changes into a new dataframe
velib_night_tracker <- rbind(velib_night_tracker, velib_comp)
#Make the now into prev
velib_prev <-velib_now
velib_prev <- rename(velib_prev, "num_bikes_available_prev"=num_bikes_available, "num_docks_available_prev"=num_docks_available)
message(Sys.time())
a<-a+1
Sys.sleep(s)
}
return(velib_night_tracker)
}
velib_night_tracker <- data.frame()
#Takes 2 seconds to do, so 58 seconds in between instead of 60
velib_night <- velib_night_function(1500, 58)
write.csv(velib_night, "nyt_climate_data/velib_night.csv")
velib_night <-read.csv("nyt_climate_data/velib_night.csv")
docks_df <- velib_night%>%
group_by(station_id)%>%
summarise(docks_change=sum(docks_avail_change))
bikes_df <- velib_night %>%
group_by(station_id) %>%
summarise(bikes_change = sum(bikes_avail_change))
velib_change <- left_join(docks_df, bikes_df, by="station_id")
write.csv(velib_change, "nyt_climate_data/velib_change.csv")
bikes_df %>%
arrange(desc(bikes_change))%>%
head()
## # A tibble: 6 × 2
## station_id bikes_change
## <dbl> <int>
## 1 394404659 499
## 2 39149651 464
## 3 66491397 456
## 4 128884445 410
## 5 476155906 399
## 6 100923502 393
docks_df %>%
arrange(desc(docks_change))%>%
head()
## # A tibble: 6 × 2
## station_id docks_change
## <dbl> <int>
## 1 394404659 474
## 2 39149651 468
## 3 66491397 452
## 4 128884445 411
## 5 49423507 390
## 6 100923502 390
velib_change <- read.csv("nyt_climate_data/velib_change.csv")
velib_change <- velib_change %>%
select(-X)
station_data <- read.csv("nyt_climate_data/station_data.csv")
station_data <- station_data %>%
select(-X)
velib_night <- read.csv("nyt_climate_data/velib_night.csv")
velib_night <- velib_night %>%
select(-X)
velib_change <- rename(velib_change, "change_station_id"=station_id)
velib_night <- rename(velib_night, "night_station_id"=station_id)
db <- dbConnect(RSQLite::SQLite(), "nyt_climate_data/velib.sqlite")
dbWriteTable(db, "velib_change", velib_change, overwrite=TRUE)
dbWriteTable(db, "station_data", station_data, overwrite=TRUE)
dbWriteTable(db, "velib_night", velib_night, overwrite=TRUE)
dbGetQuery(db, "SELECT *
FROM station_Data
LIMIT 5
")
## station_id name lat lon capacity
## 1 213688169 Benjamin Godard - Victor Hugo 48.86598 2.275725 35
## 2 653222953 Mairie de Rosny-sous-Bois 48.87126 2.486581 30
## 3 36255 Toudouze - Clauzel 48.87930 2.337360 21
## 4 37815204 Mairie du 12ème 48.84086 2.387555 30
## 5 17486274358 Jean Rostand - Paul Vaillant Couturier 48.90817 2.453060 0
dbGetQuery(db, "SELECT COUNT(*)
FROM station_Data
LIMIT 5
")
## COUNT(*)
## 1 1465
dbGetQuery(db, "SELECT *
FROM velib_change
LIMIT 5
")
## change_station_id docks_change bikes_change
## 1 6245 96 95
## 2 6293 65 65
## 3 6294 149 149
## 4 6295 117 117
## 5 6296 132 126
dbGetQuery(db, "SELECT COUNT(*)
FROM velib_change
LIMIT 5
")
## COUNT(*)
## 1 1419
dbGetQuery(db, "SELECT *
FROM velib_night
LIMIT 5
")
## night_station_id docks_avail_change bikes_avail_change time_track
## 1 213688169 0 0 2023-07-14 13:00:17
## 2 653222953 0 0 2023-07-14 13:00:17
## 3 36255 0 0 2023-07-14 13:00:17
## 4 37815204 1 1 2023-07-14 13:00:17
## 5 251039991 0 0 2023-07-14 13:00:17
dbGetQuery(db, "SELECT COUNT(*)
FROM velib_night
LIMIT 5
")
## COUNT(*)
## 1 2110822
Since a small number of stations weren’t in service on Saturday night, the number of rows in tables 1 and 2 are slightly different.
dbGetQuery(db, "SELECT *
FROM station_data
LEFT JOIN (SELECT *
FROM velib_change)
ON station_id=change_station_id
ORDER BY bikes_change DESC
LIMIT 5
")
## station_id name lat lon capacity
## 1 394404659 Gare Saint-Lazare - Cour du Havre 48.87567 2.326560 45
## 2 39149651 Richard Lenoir - Place de la Bastille 48.85383 2.369722 53
## 3 66491397 Gare de Lyon - Place Louis Armand 48.84552 2.372650 60
## 4 128884445 Constantine - Université 48.86068 2.314848 65
## 5 476155906 Saint-Antoine Sévigné 48.85502 2.361232 26
## change_station_id docks_change bikes_change
## 1 394404659 474 499
## 2 39149651 468 464
## 3 66491397 452 456
## 4 128884445 411 410
## 5 476155906 389 399
dbGetQuery(db, "SELECT COUNT(*)
FROM station_data
LEFT JOIN (SELECT *
FROM velib_change)
ON station_id=change_station_id
ORDER BY bikes_change DESC
")
## COUNT(*)
## 1 1465
dbGetQuery(db, "SELECT *
FROM velib_night
LEFT JOIN (SELECT *
FROM station_data)
ON night_station_id=station_id
LIMIT 5
")
## night_station_id docks_avail_change bikes_avail_change time_track
## 1 213688169 0 0 2023-07-14 13:00:17
## 2 653222953 0 0 2023-07-14 13:00:17
## 3 36255 0 0 2023-07-14 13:00:17
## 4 37815204 1 1 2023-07-14 13:00:17
## 5 251039991 0 0 2023-07-14 13:00:17
## station_id name lat lon capacity
## 1 213688169 Benjamin Godard - Victor Hugo 48.86598 2.275725 35
## 2 653222953 Mairie de Rosny-sous-Bois 48.87126 2.486581 30
## 3 36255 Toudouze - Clauzel 48.87930 2.337360 21
## 4 37815204 Mairie du 12ème 48.84086 2.387555 30
## 5 251039991 Cassini - Denfert-Rochereau 48.83753 2.336035 25
dbGetQuery(db, "SELECT COUNT(*)
FROM velib_night
LEFT JOIN (SELECT *
FROM station_data)
ON night_station_id=station_id
LIMIT 5
")
## COUNT(*)
## 1 2110822
This highlights the locations of the top 100 stations that night.
top_100 <- dbGetQuery(db, "SELECT *
FROM station_Data
LEFT JOIN (SELECT change_station_id, bikes_change
FROM velib_change)
ON station_id=change_station_id
ORDER BY bikes_change DESC
LIMIT 100
")
bbox_paris <- c(2.2, 48.80, 2.5, 48.92)
map <- get_stamenmap(bbox_paris, zoom=12, maptype="terrain")
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
#ggmap(map)
ggmap(map)+
geom_point(data=top_100, mapping=aes(x=lon, y=lat, color=bikes_change))+
scale_color_distiller(palette = "YlOrRd", direction = 1)+
labs(title="Map of the top 100 Vélib stations with highest number of bike changes, \nJuly 14-15",
caption="Source: Vélib API \nMap tiles by Stamen Design, under CC BY 3.0. \nMap data by OpenStreetMap, under ODbL",
color="Bike changes")+
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank())
As one can see, most of the stations that got the most use were downtown and in the East of Paris – which makes sense, this is where the nightlife is, and where people are more likely to bike to get around, as opposed to drive.
Whereas this demonstrates the location of the stations that were not touched once in that period.
zero <- dbGetQuery(db, "SELECT *
FROM station_Data
RIGHT JOIN (SELECT change_station_id, bikes_change
FROM velib_change
WHERE bikes_change==0)
ON station_id=change_station_id
")
ggmap(map)+
geom_point(data=zero, mapping=aes(x=lon, y=lat))+
labs(title="Map of the Vélib stations that saw no bike moves, \nSat Jan 14 22:30- Sun Jan 15 1:00",
caption="Source: Vélib API, Open Street Maps",
color="Bike moves")+
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank())
There are stations with not usage scattered a bit around
everywhere, but mainly in the Paris periphery (an additional 42 stations
didn’t even make it onto the map).
This next, slightly overwhelming charts presents the data for all the stations in the network (minus a few that don’t fit on my map).
everything <- dbGetQuery(db, "SELECT *
FROM station_Data
LEFT JOIN (SELECT change_station_id, bikes_change
FROM velib_change)
ON station_id=change_station_id
")
ggmap(map)+
geom_point(data=everything, mapping=aes(x=lon, y=lat, color=bikes_change))+
scale_color_distiller(palette = "YlOrRd", direction = 1)+
labs(title="Map of the Vélib stations by number of bike moves, \nSat Jan 14 22:30- Sun Jan 15 1:00",
caption="Source: Vélib API, Open Street Maps",
color="Bike moves")+
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank())
## Warning: Removed 100 rows containing missing values (`geom_point()`).
We can also use the data on the top 20 bike stations, to see what the change patterns looked like over a night.
p<- dbGetQuery(db, "SELECT *
FROM velib_night
WHERE night_station_id IN(SELECT change_station_id
FROM velib_change
ORDER BY docks_change DESC
LIMIT 100)
")
summary(p$docks_avail_change)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2102 0.0000 20.0000
p$time_track <- ymd_hms(p$time_track)
p$hour <- hour(p$time_track)
p2 <- p
p3 <- p2 %>%
group_by(night_station_id, hour) %>%
summarise(sum=sum(docks_avail_change))
## `summarise()` has grouped output by 'night_station_id'. You can override using
## the `.groups` argument.
p3$hour <- paste0(as.character(p3$hour), ":00")
color="#003f5c"
ggplot(p3, aes(x=factor(hour), y=sum))+
geom_point()+
geom_smooth()+
scale_x_discrete(limits = c("13:00", "14:00", "15:00", "16:00", "17:00", "18:00", "19:00", "20:00", "21:00", "22:00", "23:00", "0:00", "1:00", "2:00", "3:00", "4:00", "5:00", "6:00", "7:00", "8:00", "9:00", "10:00", "11:00", "12:00"))+
labs(title="Evolution of number of bike moves grouped by hour, for top 20 stations, \nJuly 14 - July 15",
caption="Source: Vélib API, Open Street Maps",
x="Hour",
y="Number of bike moves by hour")+
theme_bw()+
theme(axis.text.x = element_text(size = 6))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
This suggests that the peak time to be getting a bike at these
stations is at 23:00, though this likely undercounts the 22:00 hour,
since data collection only started at 22:30.
Finally, we can also look at calculate usage metrics as a share of total bike capacity at each station
capacity_df <- dbGetQuery(db, "SELECT *
FROM station_data
LEFT JOIN (SELECT *
FROM velib_change)
ON station_id=change_station_id
")
capacity_df$usage <- capacity_df$docks_change/capacity_df$capacity*100
print(paste0("The correlation between number of bike changes and station usage is ", cor(capacity_df$bikes_change, capacity_df$usage, method="pearson", use="complete.obs")))
## [1] "The correlation between number of bike changes and station usage is 0.73914451139539"
ggmap(map)+
geom_point(data=capacity_df, mapping=aes(x=lon, y=lat, color=usage))+
scale_color_distiller(palette = "YlOrRd", direction = 1)+
labs(title="Map of the Vélib stations by number of bike usage as a share of capacity, \nSat Jan 14 22:30- Sun Jan 15 1:00",
caption="Source: Vélib API, Open Street Maps",
color="Bike usage")+
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank())
## Warning: Removed 100 rows containing missing values (`geom_point()`).
There is a high correlation between the two variables, but usage
does suggest a slightly different picture, since some smaller stations
may have been constrained by the number of bikes they could offer to
riders. A number of stations experiencing their total capacity of bikes
coming and going in just those 2.5 hours! They still tend to be in the
center and towards the East, and you can even somewhat see the angle of
the Grands Boulevards stations in the North.
I’ll conclude by using usage to see if there any trends by longitude or latitude.
ggplot(data=capacity_df, aes(x=lon, y=usage))+
geom_point(aes(color=usage))+
geom_smooth(color="#bc5090")+
labs(title="Station usage by longitude, \nSat Jan 14 22:30- Sun Jan 15 1:00",
caption="Source: Vélib API, Open Street Maps",
color="Bike usage",
x="Longitude",
y="usage")+
theme_bw()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 47 rows containing missing values (`geom_point()`).
ggplot(data=capacity_df, aes(x=lat, y=usage))+
geom_point(aes(color=usage))+
coord_flip()+
geom_smooth(color="#bc5090")+
labs(title="Station usage by latitude, \nSat Jan 14 22:30- Sun Jan 15 1:00",
caption="Source: Vélib API, Open Street Maps",
color="Bike usage",
x="Latitude",
y="Usage")+
theme_bw()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
For both latitude and longitude, it is clear that central
locations were more popular on Saturday night.
This project has therefore more or less confirmed priors about how centrality in Paris would affect usage of bikes. And thought our method of sampling likely undercounts total rides by only campling status changes every minute, it still finds a sizable number of trips conducted on Saturday night.
sum(velib_change$docks_change, na.rm=TRUE)
## [1] 187479
Indeed, the total number of bike changes was over 8000, which suggests at least 4000 trips were conducted over the two and a half hours, or more than 27 trips/minute.
#Looking just at 6 PM- AM